Lab11

Confidence Intervals for Z-test

Example 1: C.I for Z-test

n <- 30 # sample size
alpha <- .90 # Desired coverage probability (confidence level)

x.1 <- rnorm(n) # Make a single sample from N(0, 1)
#z.1 = qnorm((1 + alpha)/2, lower.tail = T)
# Use TRUE/FALSE rather than T/F (T and F can be reassigned).
z.1 <- qnorm((1 - alpha)/2, lower.tail = FALSE) # find z-star
z.1
## [1] 1.644854
qnorm((1 + alpha)/2) # alternative formula 
## [1] 1.644854
# Make a 90% CI for the mean (known sigma = 1, so SE = 1/sqrt(n))

c(mean(x.1) - z.1*1/sqrt(n), mean(x.1) + z.1*1/sqrt(n))
## [1] -0.54371561  0.05690001

t-C.Is

Example 2:

# Construct a t-based confidence interval "by hand".
# Two-sided with the .95 quantile on each side, so this is a 90% CI.

xbar <- 185 # sample mean, given in problem
SD <- 45    # sample standard deviation, given in problem
n <- 40     # sample size, given in problem

SE <- SD / sqrt(n) # estimated standard error of the mean

qt(.95, n - 1) # t critical value with df = n - 1
## [1] 1.684875
qnorm(.95)     # normal critical value, shown for comparison
## [1] 1.644854
L <- xbar - qt(.95, n - 1) * SE # lower endpoint
U <- xbar + qt(.95, n - 1) * SE # upper endpoint

# Here is the CI:

L
## [1] 173.0119
U
## [1] 196.9881

Example 3:

Make a single CI, using the t.test() function in R

# Sample size n = 10

n = 10 # sample size

# Make a single CI, using the t.test() function in R 

x.2 <- rnorm(n)

# NOTE(review): `->` assigns right-to-left; `<-` is the preferred style.
t.test(x.2) -> mytest.95  # default confidence level = 0.95
mytest.95
## 
##  One Sample t-test
## 
## data:  x.2
## t = -0.69429, df = 9, p-value = 0.505
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
##  -1.0471856  0.5553425
## sample estimates:
##  mean of x 
## -0.2459216
# t.test() returns an "htest" object, a list whose components can be
# extracted by name:
names(mytest.95)
##  [1] "statistic"   "parameter"   "p.value"     "conf.int"    "estimate"   
##  [6] "null.value"  "stderr"      "alternative" "method"      "data.name"
mytest.95$method
## [1] "One Sample t-test"
mytest.95$statistic
##          t 
## -0.6942945
# Extract the CI:

mytest.95$conf.int # this is the CI
## [1] -1.0471856  0.5553425
## attr(,"conf.level")
## [1] 0.95
# Redo with confidence level 90%:

t.test(x.2, conf.level = .9) -> mytest.9
mytest.9
## 
##  One Sample t-test
## 
## data:  x.2
## t = -0.69429, df = 9, p-value = 0.505
## alternative hypothesis: true mean is not equal to 0
## 90 percent confidence interval:
##  -0.8952166  0.4033735
## sample estimates:
##  mean of x 
## -0.2459216

Example 4:

Now simulate 10000 t-based CIs, confidence level = 95%

N <- 10000 # Number of samples for the simulation
gamma <- .95 # Confidence level = desired coverage probability
n <- 5 # Sample size

# Preallocate the data frame of CI endpoints (never grow inside the loop)
mydf.3 <- data.frame(L = rep(NA, N), U = rep(NA, N))
for (j in seq_len(N)) {
 x.3 <- rnorm(n)
 mytest <- t.test(x.3, conf.level = gamma)
 mydf.3$L[j] <- mytest$conf.int[1]
 mydf.3$U[j] <- mytest$conf.int[2]
}

# Find fraction of cases where the CI is too far to the right: 

mean(mydf.3$L > 0) # probability that the CI is too far to the right
## [1] 0.028
# Find fraction of cases where the CI is too far to the left: 

mean(mydf.3$U < 0) # probability that the CI is too far to the left
## [1] 0.0226
# Probabilities add to about 5%.  

You can see the coverage is pretty good: the miss probability is nearly 0.025 on each side. Note that we are sampling from a normal distribution.

Example 5:

Repeat this with samples of size 5 from an exponential distribution.

gamma <- .95 # Coverage probability
n <- 5 # Sample size
# Preallocate endpoints; N (number of simulated samples) is set above
mydf.4 <- data.frame(L = rep(NA, N), U = rep(NA, N))

for (j in seq_len(N)) {
 x.4 <- rexp(n) # exponential sample, rate = 1, so true mean = 1
 mytest.4 <- t.test(x.4, conf.level = gamma)
 mydf.4$L[j] <- mytest.4$conf.int[1]
 mydf.4$U[j] <- mytest.4$conf.int[2]
}

mean(mydf.4$L > 1) # too far to the right (true mean is 1 since lambda = 1)
## [1] 0.0046
mean(mydf.4$U < 1) # too far to the left, about 11%
## [1] 0.1173

The coverage probability is much less than 95%. Too many confidence intervals are too far to the left to contain the correct value.

Segment plot to examine this:

# Empty plotting frame, then overlay the first 100 CIs.
plot(x = c(-2, 4), y = c(1, 100), type = "n", xlab = "",
  ylab = "",
  main = "100 CI's, n=5, exponential, gamma = .95")
# segments() is vectorized over its endpoints, so no loop is needed:
segments(mydf.4$L[1:100], 1:100, mydf.4$U[1:100], 1:100)
abline(v = 1, lwd = 2, col = 2) # true mean

Most of the C.I.'s are wide, and the ones that miss tend to lie too far to the left of the true mean.

Example 6:

Repeat with larger sample size.

n <- 50 # larger sample size; reuses N, gamma and mydf.4 from above
for (j in seq_len(N)) {
  x.4 <- rexp(n)
  mytest.4 <- t.test(x.4, conf.level = gamma)
  mydf.4$L[j] <- mytest.4$conf.int[1]
  mydf.4$U[j] <- mytest.4$conf.int[2]
}

mean(mydf.4$L > 1)
## [1] 0.01
mean(mydf.4$U < 1)
## [1] 0.0548
# This is better, but the total miss rate should be 5% and it is
# still a little bit higher than 5%.

# Segment plot

plot(x = c(0, 2), y = c(1, 100), type = "n", xlab = "",
 ylab = "",
 main = "100 CI's, n = 50, exponential, gamma = .95")

# Draw the first 100 CIs as horizontal segments (vectorized, no loop)
segments(mydf.4$L[1:100], 1:100, mydf.4$U[1:100], 1:100)
abline(v = 1, lwd = 2, col = 2) # true mean

Example 7: Now do this for a uniform distribution.

This is a symmetric distribution with only one peak. The t-CI gives good coverage already for small sample sizes.

gamma <- .95 # Coverage probability
n <- 10 # Sample size
# Preallocate directly rather than copying mydf.4 (both columns are
# fully overwritten in the loop, so this is equivalent and clearer).
mydf.5 <- data.frame(L = rep(NA, N), U = rep(NA, N))
for (j in seq_len(N)) {
  x.5 <- runif(n) # Unif(0, 1), so true mean = 0.5
  mytest.5 <- t.test(x.5, conf.level = gamma)
  mydf.5$L[j] <- mytest.5$conf.int[1]
  mydf.5$U[j] <- mytest.5$conf.int[2]
}

mean(mydf.5$L > .5) # about 2.5%
## [1] 0.0252
mean(mydf.5$U < .5) # about 2.7%, so altogether about 5% = acceptable
## [1] 0.027
# Good coverage probability.

Let’s confirm this with a segment plot of 100 CIs.

plot(x = c(0, 1), y = c(1, 100), type = "n", xlab = "",
  ylab = "",
  main = "100 CI's, n = 10, uniform, gamma = .95")

# Draw the first 100 CIs as horizontal segments (vectorized, no loop)
segments(mydf.5$L[1:100], 1:100, mydf.5$U[1:100], 1:100)
abline(v = .5, lwd = 2, col = 2) # true mean of Unif(0, 1)

Example 8: Using actual data “Girls2004.csv”

# Birth data: one row per baby girl; Weight is reported in grams
# (see CI interpretation below) — TODO confirm units against the codebook.
Girls2004 <- read.csv("Girls2004.csv", stringsAsFactors=FALSE)
head(Girls2004)
##   ID State MothersAge Smoker Weight Gestation
## 1  1    WY      15-19     No   3085        40
## 2  2    WY      35-39     No   3515        39
## 3  3    WY      25-29     No   3775        40
## 4  4    WY      20-24     No   3265        39
## 5  5    WY      25-29     No   2970        40
## 6  6    WY      20-24     No   2850        38
# Explore dependence of Weight on state and smoking history.

boxplot(Weight ~ State, data = Girls2004)

boxplot(Weight ~ Smoker, data = Girls2004)

boxplot(Weight ~ Smoker*State, data = Girls2004)

library(ggplot2)

# Same comparison in ggplot2: boxplots with jittered raw points overlaid.
(MyL1<-ggplot(Girls2004, aes(x=State, y=Weight,fill=State))+
    geom_boxplot()+
    geom_jitter(position=position_jitter(.01), aes(color=State))+
  ggtitle("Weights of new born Girls by State"))

# Basic boxplot saved to p (assigned, not printed).
p<-ggplot(Girls2004, aes(x=State, y=Weight, fill=State))+ geom_boxplot()

# Extract data from the two states

girlsWY <- Girls2004$Weight[Girls2004$State == "WY"]
girlsAK <- Girls2004$Weight[Girls2004$State == "AK"]

Make a CI for the mean weight of girl babies in Wyoming and Alaska

# One-sample t CI for the mean weight of Wyoming girls.
mytest <- t.test(girlsWY, conf.level = .95, alternative = "two.sided")
mytest
## 
##  One Sample t-test
## 
## data:  girlsWY
## t = 48.5, df = 39, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
##  3074.115 3341.685
## sample estimates:
## mean of x 
##    3207.9
# Just the interval (with its confidence level as an attribute):
mytest$conf.int
## [1] 3074.115 3341.685
## attr(,"conf.level")
## [1] 0.95

We are 95% confident that the average weight of girl babies in Wyoming is between 3074 grams and 3341 grams.

Same thing for data from Alaska

# Same one-sample t CI for the Alaska sample.
mytest.AK <- t.test(girlsAK, conf.level = .95,
                      alternative = "two.sided")
mytest.AK
## 
##  One Sample t-test
## 
## data:  girlsAK
## t = 38.421, df = 39, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
##  3331.23 3701.47
## sample estimates:
## mean of x 
##   3516.35

We are 95% confident that the average weight of girl babies in Alaska is between 3331 grams and 3701 grams.

Make a CI for the difference of means.

# Two-sample (Welch) CI for the difference of means, AK minus WY,
# at confidence level 90%.
mytest.a <- t.test(girlsAK, girlsWY,
    conf.level = .9,
    alternative = "two.sided")

mytest.a
## 
##  Welch Two Sample t-test
## 
## data:  girlsAK and girlsWY
## t = 2.7316, df = 71.007, p-value = 0.007946
## alternative hypothesis: true difference in means is not equal to 0
## 90 percent confidence interval:
##  120.2575 496.6425
## sample estimates:
## mean of x mean of y 
##   3516.35   3207.90
# Note the Welch estimate for the degrees of freedom 
# which results in a non-integer value.  

We are 90% confident that the average weight of girl babies in Alaska is between 120 g and 496 g higher than the average weight in Wyoming.

Alternative way to construct the test:

# Same Welch test via the formula interface (response ~ group).
mytest.b <- t.test(Weight ~ State, data = Girls2004, conf.level = .9) #90% C.I
mytest.b
## 
##  Welch Two Sample t-test
## 
## data:  Weight by State
## t = 2.7316, df = 71.007, p-value = 0.007946
## alternative hypothesis: true difference in means between group AK and group WY is not equal to 0
## 90 percent confidence interval:
##  120.2575 496.6425
## sample estimates:
## mean in group AK mean in group WY 
##          3516.35          3207.90
# Change the confidence level:

mytest.b <- t.test(Weight ~ State, data = Girls2004, conf.level = .95) #95% C.I
mytest.b
## 
##  Welch Two Sample t-test
## 
## data:  Weight by State
## t = 2.7316, df = 71.007, p-value = 0.007946
## alternative hypothesis: true difference in means between group AK and group WY is not equal to 0
## 95 percent confidence interval:
##   83.29395 533.60605
## sample estimates:
## mean in group AK mean in group WY 
##          3516.35          3207.90

One Sided C.I

Import the data set Titanic.csv which contains survival data (0 = death, 1 = survival) and ages of 658 passengers of the Titanic which sank on April 15, 1912. Examine the null hypothesis that the mean ages of survivors and of victims are the same against the alternative that these mean ages are different, using a t-test.

Compute the confidence Interval for the true mean difference?

Import the data. There are 135 survivors and 523 victims in this data set. We can make a side by side boxplot and see that the age distributions of survivors and victims are very similar, but survivors have a slightly smaller median. The mean age of survivors is 26.98 and the mean age of victims is 31.52. The difference is 4.54.

 # NOTE(review): this shadows the built-in datasets::Titanic table.
 Titanic <- read.csv("Titanic.csv", stringsAsFactors=FALSE) 
head(Titanic)
##   ID Survived   Age
## 1  1        1  0.92
## 2  2        0 30.00
## 3  3        1 48.00
## 4  4        0 39.00
## 5  5        0 71.00
## 6  6        0 47.00
boxplot(Age ~ Survived, data = Titanic)

#let's look at the mean age within each Survived group (0 = died, 1 = survived)
agg.df <- aggregate(Age ~ Survived, data = Titanic, FUN = mean)
print(agg.df)
##   Survived      Age
## 1        0 31.51641
## 2        1 26.97778
# Difference of group means (victims minus survivors); the outer
# parentheses make the assignment also print.
(mean.diff <- agg.df[1,2] - agg.df[2,2])
## [1] 4.538628

Let \(\mu_S =\) population mean age of survivors and \(\mu_v =\) population mean age of victims.

The null hypothesis is \(H_0: \mu_V - \mu_S = 0\) and the alternative is \(H_a: \mu_V - \mu_S > 0\).

# Split ages by survival status; drop = TRUE returns plain vectors.
# (Use TRUE rather than T, and spell out argument names fully —
# partial matching like alt= is fragile.)
survivors <- subset(Titanic, select = Age, subset = Survived == "1", drop = TRUE)
victims <- subset(Titanic, select = Age, subset = Survived == "0", drop = TRUE)

# One-sided Welch test of H_a: mean(victims) - mean(survivors) > 0
t.test(victims, survivors, alternative = "greater")
## 
##  Welch Two Sample t-test
## 
## data:  victims and survivors
## t = 3.091, df = 191.92, p-value = 0.001146
## alternative hypothesis: true difference in means is greater than 0
## 95 percent confidence interval:
##  2.111742      Inf
## sample estimates:
## mean of x mean of y 
##  31.51641  26.97778
###or

t.test(Age ~ Survived, data = Titanic, alternative = "greater")$conf
## [1] 2.111742      Inf
## attr(,"conf.level")
## [1] 0.95

With \(95\%\) confidence it can be concluded that the true mean age of victims is at least 2 years more than the mean age of survivors.

Comparison of Bootstrap CI and t-based CI

Example 9: Use the Verizon data

Bootstrap C.I.

# Repair times (hours) by customer group — TODO confirm units.
Verizon <- read.csv("Verizon.csv", stringsAsFactors=FALSE)
head(Verizon)
##    Time Group
## 1 17.50  ILEC
## 2  2.40  ILEC
## 3  0.00  ILEC
## 4  0.65  ILEC
## 5 22.23  ILEC
## 6  1.20  ILEC
# Separate the two groups.

VerizonI <- Verizon$Time[Verizon$Group == "ILEC"]
VerizonC <- Verizon$Time[Verizon$Group == "CLEC"]

# First look at the ILEC data (sample size = 1664)

# Make a 95% CI using the bootstrap for verizon customers
# (10000 resamples with replacement; use TRUE, not T)

boot.mean.VerizonI <- replicate(10000, mean(sample(VerizonI, 
                                                   replace = TRUE)))

quantile(boot.mean.VerizonI, c(.025, .975))
##     2.5%    97.5% 
## 7.716504 9.138038
# Bootstrap: for customers from other companies

boot.mean.VerizonC <- replicate(10000, mean(sample(VerizonC, 
                                                   replace = TRUE)))

quantile(boot.mean.VerizonC, c(.025, .975))
##     2.5%    97.5% 
## 10.12642 25.62305

Make a 95% CI for the mean repair time using t.test

# t-based 95% CI for the ILEC mean (n = 1664, so t is well justified).
t.test(VerizonI, conf.level = .95)$conf.int
## [1] 7.705276 9.117945
## attr(,"conf.level")
## [1] 0.95
# The two CIs are essentially the same (large sample size)

# Now do this for the CLEC data.
# (n = 23 and the data are skewed, so the t assumptions are doubtful here.)

t.test(VerizonC, conf.level = .95)$conf.int
## [1]  8.075152 24.943109
## attr(,"conf.level")
## [1] 0.95

Remember that the sample size was large for Verizon's own customers (ILEC, 1664) but very small for customers of other companies (CLEC, 23). Therefore, it is questionable to use the t-test for the CLEC data.

So Which one should we use?

Example 10: Which one should we believe?

Compare bootstrap CI and t.test CI for data from a known distribution that is skewed.

  • Exponential distribution, lambda = 1, sample size = 10.

  • Replicate CIs for the pop. mean for 1000 samples, with confidence level 90% (alpha = .9 in the code below).

# Fraction of CIs that contains 1 = simulated coverage probability.
# Put everything in one data frame (preallocate all four columns at once
# instead of adding them one by one).

mydf.6 <- data.frame(L.t = rep(NA, 1000), U.t = rep(NA, 1000),
                     L.boot = rep(NA, 1000), U.boot = rep(NA, 1000))

# make 1000 samples, 
# find t.test CI for each case,
# make bootstrap CI for each case,

n <- 10
alpha <- .9 # confidence level for both interval types
for (j in seq_len(1000)) {
  x.0 <- rexp(n)
  mydf.6[j, 1:2] <- t.test(x.0, conf.level = alpha)$conf.int
  boot.mean <- replicate(10000, mean(sample(x.0, replace = TRUE)))
  mydf.6[j, 3:4] <- quantile(boot.mean, c((1-alpha)/2, (1+alpha)/2))
}

# Bootstrap distribution of the LAST simulated sample only.
hist(boot.mean, breaks = 40)

head(mydf.6)
##         L.t      U.t    L.boot   U.boot
## 1 0.2372943 1.116045 0.3238111 1.076931
## 2 0.3470579 1.689544 0.4762477 1.629433
## 3 0.8799204 2.342777 1.0403411 2.263653
## 4 0.3944768 2.503756 0.6287254 2.428744
## 5 0.1087424 1.147443 0.2303944 1.106497
## 6 0.4996889 1.404923 0.5868083 1.354018
# Does each interval contain the true mean (1 for Exp(1))?
mydf.6$t.cover <- (mydf.6$L.t < 1 & mydf.6$U.t > 1)
mydf.6$boot.cover <- (mydf.6$L.boot < 1 & mydf.6$U.boot > 1)

mean(mydf.6$t.cover) # t CI covers the true mean about 86% of the time
## [1] 0.862
mean(mydf.6$boot.cover) # bootstrap CI covers about 83% of the time
## [1] 0.827

Still, the t-based interval shows somewhat better coverage than the bootstrap interval here, although both fall short of the nominal level.

CI for Proportions for one sample

Example 11

Gallup poll data of October 2017: Observe 134 successes (Yes answers) in 1028 trials (interviews). Gallup poll data of October 2018: Observe 214 successes (Yes answers) in 1021 trials (interviews).

# Gallup, October 2017: trials (interviews) and successes (Yes answers).
n.2017 = 1028
yes.2017 = 134

prop.test(yes.2017, n.2017)
## 
##  1-sample proportions test with continuity correction
## 
## data:  yes.2017 out of n.2017, null probability 0.5
## X-squared = 560.39, df = 1, p-value < 2.2e-16
## alternative hypothesis: true p is not equal to 0.5
## 95 percent confidence interval:
##  0.1106849 0.1528326
## sample estimates:
##         p 
## 0.1303502
# Same thing for 214 successes in 1021 trials:

n.2018 = 1021
yes.2018 = 214
prop.test(yes.2018, n.2018)$conf.int # R computes score intervals.
## [1] 0.1852772 0.2361393
## attr(,"conf.level")
## [1] 0.95

Example 12

Compute Wald confidence intervals by hand for comparison.

# Wald 95% interval: p.hat +/- z* sqrt(p.hat (1 - p.hat) / n)
p.hat <- yes.2017 / n.2017                 # sample proportion
SE <- sqrt(p.hat * (1 - p.hat) / n.2017)   # estimated standard error
zstar <- qnorm(.975)                       # z* for a two-sided 95% CI
c(p.hat - zstar * SE, p.hat + zstar * SE)
## [1] 0.1097686 0.1509318
# Score interval from prop.test(), for comparison:
prop.test(yes.2017, n.2017)$conf.int
## [1] 0.1106849 0.1528326
## attr(,"conf.level")
## [1] 0.95

CI for the difference of two proportions.

# Two-sample proportion test: 2018 vs 2017 Yes rates.
prop.test(c(yes.2018,yes.2017),c(n.2018,n.2017))
## 
##  2-sample test for equality of proportions with continuity correction
## 
## data:  c(yes.2018, yes.2017) out of c(n.2018, n.2017)
## X-squared = 22.258, df = 1, p-value = 2.383e-06
## alternative hypothesis: two.sided
## 95 percent confidence interval:
##  0.04591606 0.11258042
## sample estimates:
##    prop 1    prop 2 
## 0.2095984 0.1303502
# The CI does not contains 0, so there is evidence from the CI
# that the two proportions are different.

# We can also perform a chi squared test to find out 
# if the proportion of successes is the same in both populations.
# We must use the number of success and the number of failures to do this,
# not the number of successes and the number of trials.

chisq.test(matrix(c(yes.2018,yes.2017,
                    n.2018 - yes.2018,n.2017 - yes.2017),ncol = 2))
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  matrix(c(yes.2018, yes.2017, n.2018 - yes.2018, n.2017 - yes.2017),     ncol = 2)
## X-squared = 22.258, df = 1, p-value = 2.383e-06
# Note that  Chi-squared statistic and p-value are the same.

Example 13: One-sided confidence bounds

  1. Make a 95% lower confidence bound for the mean birth weights of babies born in Alaska
# alternative = "greater" yields a one-sided interval (L, Inf),
# i.e. a lower confidence bound for the mean.
mytest.AK.lower <- t.test(girlsAK, conf.level = .95,
                    alternative = "greater")
mytest.AK.lower
## 
##  One Sample t-test
## 
## data:  girlsAK
## t = 38.421, df = 39, p-value < 2.2e-16
## alternative hypothesis: true mean is greater than 0
## 95 percent confidence interval:
##  3362.147      Inf
## sample estimates:
## mean of x 
##   3516.35
  1. Make a 99% lower confidence bound for the difference of mean birth weights (Alaska - Wyoming)
mytest.diff.lower = t.test(Weight ~ State, data = Girls2004, 
                           alternative = "greater", conf.level = .99)

mytest.diff.lower$conf.int
## [1] 39.69792      Inf
## attr(,"conf.level")
## [1] 0.99
  1. Make a 95% upper confidence bound for approval rate for Congress in October 2018
# alternative = "less" yields (0, U): an upper confidence bound.
prop.test(yes.2018, n.2018, alternative = "less")$conf.int
## [1] 0.0000000 0.2318109
## attr(,"conf.level")
## [1] 0.95

Simple Linear Regression

Example 1: From the notes

library(ISLR)
library(MASS)
# Advertising data: Sales regressed on three media budgets.
ad=read.csv("advertising.csv")
head(ad)
##      TV Radio Newspaper Sales
## 1 230.1  37.8      69.2  22.1
## 2  44.5  39.3      45.1  10.4
## 3  17.2  45.9      69.3  12.0
## 4 151.5  41.3      58.5  16.5
## 5 180.8  10.8      58.4  17.9
## 6   8.7  48.9      75.0   7.2
# Full model with all three predictors.
fit=lm(Sales ~ TV+Radio+Newspaper, data=ad)
summary(fit)
## 
## Call:
## lm(formula = Sales ~ TV + Radio + Newspaper, data = ad)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7.3034 -0.8244 -0.0008  0.8976  3.7473 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 4.6251241  0.3075012  15.041   <2e-16 ***
## TV          0.0544458  0.0013752  39.592   <2e-16 ***
## Radio       0.1070012  0.0084896  12.604   <2e-16 ***
## Newspaper   0.0003357  0.0057881   0.058    0.954    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.662 on 196 degrees of freedom
## Multiple R-squared:  0.9026, Adjusted R-squared:  0.9011 
## F-statistic: 605.4 on 3 and 196 DF,  p-value: < 2.2e-16
# Newspaper is not significant (p = 0.954), so refit without it.
fit2=lm(Sales ~ TV+Radio, data=ad)
summary(fit2)
## 
## Call:
## lm(formula = Sales ~ TV + Radio, data = ad)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7.3131 -0.8269  0.0095  0.9022  3.7484 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 4.630879   0.290308   15.95   <2e-16 ***
## TV          0.054449   0.001371   39.73   <2e-16 ***
## Radio       0.107175   0.007926   13.52   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.657 on 197 degrees of freedom
## Multiple R-squared:  0.9026, Adjusted R-squared:  0.9016 
## F-statistic: 912.7 on 2 and 197 DF,  p-value: < 2.2e-16
# Credit data from ISLR: regression with categorical (factor) predictors.
Cr_data=  ISLR::Credit
head(Cr_data)
##   ID  Income Limit Rating Cards Age Education Gender Student Married Ethnicity
## 1  1  14.891  3606    283     2  34        11   Male      No     Yes Caucasian
## 2  2 106.025  6645    483     3  82        15 Female     Yes     Yes     Asian
## 3  3 104.593  7075    514     4  71        11   Male      No      No     Asian
## 4  4 148.924  9504    681     3  36        11 Female      No      No     Asian
## 5  5  55.882  4897    357     2  68        16   Male      No     Yes Caucasian
## 6  6  80.180  8047    569     4  77        10   Male      No      No Caucasian
##   Balance
## 1     333
## 2     903
## 3     580
## 4     964
## 5     331
## 6    1151
# Two-level factor: R creates one dummy variable (GenderFemale).
fit3 = lm(Balance ~ Gender, data=Cr_data)
summary(fit3)
## 
## Call:
## lm(formula = Balance ~ Gender, data = Cr_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -529.54 -455.35  -60.17  334.71 1489.20 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    509.80      33.13  15.389   <2e-16 ***
## GenderFemale    19.73      46.05   0.429    0.669    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 460.2 on 398 degrees of freedom
## Multiple R-squared:  0.0004611,  Adjusted R-squared:  -0.00205 
## F-statistic: 0.1836 on 1 and 398 DF,  p-value: 0.6685
# Show the dummy coding R uses for this factor.
contrasts(Cr_data$Gender)
##        Female
##  Male       0
## Female      1
# Three-level factor: R creates two dummy variables,
# with "African American" as the baseline level.
fit4 = lm(Balance ~ Ethnicity, data=Cr_data)
summary(fit4)
## 
## Call:
## lm(formula = Balance ~ Ethnicity, data = Cr_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -531.00 -457.08  -63.25  339.25 1480.50 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          531.00      46.32  11.464   <2e-16 ***
## EthnicityAsian       -18.69      65.02  -0.287    0.774    
## EthnicityCaucasian   -12.50      56.68  -0.221    0.826    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 460.9 on 397 degrees of freedom
## Multiple R-squared:  0.0002188,  Adjusted R-squared:  -0.004818 
## F-statistic: 0.04344 on 2 and 397 DF,  p-value: 0.9575
contrasts(Cr_data$Ethnicity)
##                  Asian Caucasian
## African American     0         0
## Asian                1         0
## Caucasian            0         1

Example 2

library(ISLR)
library(MASS)
# Boston housing data (MASS): medv = median home value, lstat = % lower status.
names(Boston)
##  [1] "crim"    "zn"      "indus"   "chas"    "nox"     "rm"      "age"    
##  [8] "dis"     "rad"     "tax"     "ptratio" "black"   "lstat"   "medv"
?Boston
plot(medv~lstat,data=Boston)

lm.fit=lm(medv~lstat,data=Boston)

# Scatter plot with the fitted regression line overlaid.
plot(medv~lstat,data=Boston)
abline(lm.fit, col="red")

# or equivalently — NOTE(review): attach() is discouraged (it can silently
# mask other variables); prefer the data = argument as above.
attach(Boston)
lm.fit=lm(medv~lstat)


summary(lm.fit)
## 
## Call:
## lm(formula = medv ~ lstat)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -15.168  -3.990  -1.318   2.034  24.500 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 34.55384    0.56263   61.41   <2e-16 ***
## lstat       -0.95005    0.03873  -24.53   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.216 on 504 degrees of freedom
## Multiple R-squared:  0.5441, Adjusted R-squared:  0.5432 
## F-statistic: 601.6 on 1 and 504 DF,  p-value: < 2.2e-16
names(lm.fit)
##  [1] "coefficients"  "residuals"     "effects"       "rank"         
##  [5] "fitted.values" "assign"        "qr"            "df.residual"  
##  [9] "xlevels"       "call"          "terms"         "model"
coef(lm.fit)
## (Intercept)       lstat 
##  34.5538409  -0.9500494
# 95% CIs for the intercept and slope.
confint(lm.fit)# CI
##                 2.5 %     97.5 %
## (Intercept) 33.448457 35.6592247
## lstat       -1.026148 -0.8739505
plot(lm.fit$residuals)

# Standard diagnostic plots for the fitted model.
plot(lm.fit)

Example 3

Data Science Question: Is the variable “energy” contributes for predicting “danceability” of the songs in for both musicians; Taylor Swift and Beyonce?

Danceability: Danceability describes how suitable a track is for dancing based on a combination of musical elements including tempo, rhythm stability, beat strength, and overall regularity. A value of 0.0 is least danceable and 1.0 is most danceable.

# Load the Artists data set: one row per track with audio features
# (danceability, energy, ...) and artist/album metadata.
Artists <- read.csv("Artists.csv",header = TRUE)

Danceability and Energy of songs by Taylor Swift

# Fix the RNG so the train/test partition is reproducible.
set.seed(12)
library(caret)
## Loading required package: lattice
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✔ tibble  3.1.7     ✔ dplyr   1.0.9
## ✔ tidyr   1.2.0     ✔ stringr 1.4.0
## ✔ readr   2.1.2     ✔ forcats 0.5.1
## ✔ purrr   0.3.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ✖ purrr::lift()   masks caret::lift()
## ✖ dplyr::select() masks MASS::select()
Artists1 <-Artists #make a copy

# Keep only Taylor Swift's tracks.
ts<- Artists1[Artists1$artist_name=="Taylor Swift",]
names(ts)
##  [1] "X"                  "artist_name"        "Valence"           
##  [4] "danceability"       "energy"             "loudness"          
##  [7] "speechiness"        "acousticness"       "liveness"          
## [10] "tempo"              "track_name"         "album_name"        
## [13] "album_release_year"
# 80/20 train/test split stratified on the response.
training.samples <- ts$danceability %>%
  createDataPartition(p = 0.8, list = FALSE)

train.data  <- ts[training.samples, ]
test.data <- ts[-training.samples, ]
dim(train.data)
## [1] 960  13
# Simple linear regression: danceability on energy (training data only).
fit1 =lm(danceability~energy, data=train.data)
summary(fit1)
## 
## Call:
## lm(formula = danceability ~ energy, data = train.data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.42627 -0.06289  0.00801  0.06838  0.29897 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.61000    0.01059  57.629   <2e-16 ***
## energy      -0.03272    0.01749  -1.871   0.0616 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1108 on 958 degrees of freedom
## Multiple R-squared:  0.003642,   Adjusted R-squared:  0.002602 
## F-statistic: 3.501 on 1 and 958 DF,  p-value: 0.06162

The p-value is large, so at the 5% significance level there does not appear to be a significant relationship between energy and danceability in Taylor Swift’s songs.

The F statistic is small, also indicating there is no significant relationship between the predictor and the response.

Moreover, \(R^2 \approx 0.004\), indicating that any relationship is very weak. The estimated slope is negative, so the (weak) relationship is negative.

Danceability and Energy of songs by Beyoncé

# Same analysis for Beyoncé's tracks; reset the seed for reproducibility.
set.seed(12)
library(caret)

by<- Artists1[Artists1$artist_name=="Beyoncé",]
names(by)
##  [1] "X"                  "artist_name"        "Valence"           
##  [4] "danceability"       "energy"             "loudness"          
##  [7] "speechiness"        "acousticness"       "liveness"          
## [10] "tempo"              "track_name"         "album_name"        
## [13] "album_release_year"
# 80/20 train/test split stratified on the response.
training.samples <- by$danceability %>%
  createDataPartition(p = 0.8, list = FALSE)

train.data2  <- by[training.samples, ]
test.data2 <- by[-training.samples, ]
dim(train.data2)
## [1] 537  13
# Simple linear regression: danceability on energy (training data only).
fit2 =lm(danceability~energy, data=train.data2)
summary(fit2)
## 
## Call:
## lm(formula = danceability ~ energy, data = train.data2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.55779 -0.11707  0.00723  0.10903  0.34103 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.47185    0.02337  20.189  < 2e-16 ***
## energy       0.15401    0.03558   4.328 1.79e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1662 on 535 degrees of freedom
## Multiple R-squared:  0.03383,    Adjusted R-squared:  0.03203 
## F-statistic: 18.74 on 1 and 535 DF,  p-value: 1.793e-05

The p-value is very small, so at the 5% significance level there is a significant relationship between energy and danceability in Beyoncé’s songs.

The F statistic is large, also indicating there is a significant relationship between the predictor and the response.

Finally, our model equation can be written as follows:

\[dancebility = 0.47 + 0.15*energy \]

The confidence intervals of the model coefficients can be extracted as follows:

# 95% CIs for the intercept and the energy slope of the Beyoncé model.
confint(fit2)
##                  2.5 %    97.5 %
## (Intercept) 0.42593927 0.5177620
## energy      0.08411477 0.2239069

Model accuracy assessment:

As we have seen in simple linear regression, the overall quality of the model can be assessed by examining the R-squared (R2) and Residual Standard Error (RSE).

R-squared:

\(R^2\) represents the proportion of variance in the outcome variable y that is explained by the model. An \(R^2\) value close to 1 indicates that the model explains a large portion of the variance in the outcome variable.

A problem with the R^2, is that, it will always increase when more variables are added to the model, even if those variables are only weakly associated with the response (James et al. 2014).

A solution is to adjust the R^2 by taking into account the number of predictor variables.

The adjustment in the “Adjusted R Square” value in the summary output is a correction for the number of x variables included in the prediction model.

Residual Standard Error (RSE) or sigma(“residual standard deviation”):

The RSE estimate gives a measure of error of prediction. The lower the RSE, the more accurate the model.

Predictions

# Predict danceability on each model's held-out test set.
predictions <- fit1 %>% predict(test.data)
#or (equivalent without the pipe)
y_hat <- predict(fit1, test.data)
predictions2 <- fit2 %>% predict(test.data2)
#or (equivalent without the pipe)
y_hat2 <- predict(fit2, test.data2)

Model performance

  1. Model 1
# (a) Prediction error, RMSE (Root mean square error), on the test set
RMSE(predictions, test.data$danceability) # RMSE{caret package}
## [1] 0.107848
# (b) R-square ## R2{caret package}
R2(predictions, test.data$danceability)
## [1] 0.01209772
  1. Model 2
# (a) Prediction error, RMSE (Root mean square error), on the test set
RMSE(predictions2, test.data2$danceability) # RMSE{caret package}
## [1] 0.1691094
# (b) R-square ## R2{caret package}
R2(predictions2, test.data2$danceability)
## [1] 0.04044137

The prediction error (RMSE) of model 1 is lower than that of model 2. Note, however, that we are comparing two completely different models; this comparison would only be meaningful for models of the same response fitted with different sets of predictors.